In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree
from sklearn import ensemble

import pytz
import itertools
import visualize  # NOTE(review): rebound below by `import visualize_plotly as visualize`; this import is effectively unused
import utils
import pydotplus
import xgboost as xgb

from sklearn import metrics
from sklearn import model_selection

import pvlib
import cs_detection

# Rebinds `visualize` to the plotly-backed module for the rest of the notebook.
import visualize_plotly as visualize
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

from IPython.display import Image

%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=4)
%matplotlib notebook

Ground predictions

PVLib Clearsky

Only making ground predictions using the PVLib clearsky model and the statistical model. The NSRDB model won't be available for ground measurements.

In [2]:
# Load the NSRDB dataset and localize its timestamps to MST.
# NOTE(review): the recorded output shows this raised a TypeError inside
# ClearskyDetection.__init__ (scale_by_irrad missing `col`) — project bug.
nsrdb_pickle = 'abq_nsrdb_1.pkl.gz'
nsrdb = cs_detection.ClearskyDetection.read_pickle(nsrdb_pickle)
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-2-064248b69272> in <module>()
----> 1 nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz')
      2 nsrdb.df.index = nsrdb.df.index.tz_convert('MST')

~/duramat/clearsky_detection/cs_detection.py in read_pickle(cls, filename, *args, **kwargs)
    219         """
    220         df = pd.read_pickle(filename)
--> 221         return cls(df, *args, **kwargs)
    222 
    223     def to_pickle(self, filename, overwrite=False):

~/duramat/clearsky_detection/cs_detection.py in __init__(self, df, copy, set_ghi_status, scale_col, time_from_noon)
     40             self.df['scale'] = 1.0e0
     41         elif scale_col == 'irrad_scaler':  # fix up
---> 42             self.scale_by_irrad(label='scale')
     43         else:
     44             raise RuntimeError('Argument for scale_col unrecognized.')

TypeError: scale_by_irrad() missing 1 required positional argument: 'col'
nsrdb.trim_dates('01-01-2014', '01-01-2016')
In [3]:
# Derive time-from-solar-noon features from the PVLib clearsky curve
# (project API; 'tfn' is used later as a feature column).
nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
nsrdb.time_from_solar_noon_ratio2('Clearsky GHI pvlib')
In [4]:
# Classifier inputs: GHI status, time-from-noon, and windowed statistics
# (value / rolling mean / rolling std) of three measured-vs-clearsky ratios,
# plus the line-length ratio. min/max variants were tried and dropped.
_windowed_bases = [
    'abs_ideal_ratio_diff',
    'GHI Clearsky GHI pvlib gradient ratio',
    'GHI Clearsky GHI pvlib gradient second ratio',
]

feature_cols = ['ghi_status', 'tfn']
for _base in _windowed_bases:
    feature_cols += [_base, _base + ' mean', _base + ' std']
feature_cols.append('GHI Clearsky GHI pvlib line length ratio')

# Label column (NSRDB clear-sky flag).
target_cols = ['sky_status']

Train/test on NSRDB data to find optimal parameters

In [5]:
# Split NSRDB into a training window (through 2014) and a test window (2015+).
train = cs_detection.ClearskyDetection(nsrdb.df)
test = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test.trim_dates('01-01-2015', None)
In [6]:
# Prepare training features: fit measured GHI to the PVLib clearsky model,
# add the time-from-noon ratio, and scale by irradiance (project API).
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
train.time_from_solar_noon_ratio2('Clearsky GHI pvlib')
train.scale_by_irrad('Clearsky GHI pvlib')
In [7]:
import warnings
In [10]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Small random-forest hyperparameter grid.
    params = {}
    params['max_depth'] = [4, 5, 6]
    params['n_estimators'] = [32, 64, 128]
    params['class_weight'] = [None]  # , 'balanced']

    # The training slice and its window-metric features do not depend on the
    # hyperparameters, so build them once instead of once per grid point.
    train2 = cs_detection.ClearskyDetection(train.df)
    train2.trim_dates('01-01-1999', '01-01-2014')
    utils.calc_all_window_metrics(train2.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)

    for depth, nest, cw in itertools.product(params['max_depth'], params['n_estimators'], params['class_weight']):
        # Rebuild test2 per iteration: iter_predict_daily may write columns
        # into test2.df, so a fresh copy keeps grid points independent.
        test2 = cs_detection.ClearskyDetection(train.df)
        test2.trim_dates('01-01-2014', '01-01-2015')
        clf = ensemble.RandomForestClassifier(max_depth=depth, n_estimators=nest, class_weight=cw, n_jobs=-1)
        clf.fit(train2.df[feature_cols].values, train2.df[target_cols].values.flatten())
        pred = test2.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
        f1_score = metrics.f1_score(test2.df['sky_status'], pred)
        recall_score = metrics.recall_score(test2.df['sky_status'], pred)
        precision_score = metrics.precision_score(test2.df['sky_status'], pred)
        print(f1_score, recall_score, precision_score, depth, nest, cw)
0.905871578305 0.925941335552 0.886653386454 4 32 None
0.905702647658 0.925109215727 0.887093556752 4 64 None
0.905871578305 0.925941335552 0.886653386454 4 128 None
0.906596198801 0.927813605159 0.886327503975 5 32 None
0.907078297959 0.929061784897 0.886111111111 5 64 None
0.906688351291 0.927813605159 0.886503677201 5 128 None
0.906815869786 0.92718951529 0.887318335656 6 32 None
0.907371631927 0.928229665072 0.887430389817 6 64 None
0.9074281069 0.928853754941 0.886968613429 6 128 None
## output from above cell ## search max_depth 4-7, n_estimators [32, 64, 128], class_weight [None, 'balanced'] 0.905814071887 0.925317245683 0.88711607499 4 32 None 0.897951332115 0.971083836072 0.835062611807 4 32 balanced 0.906055979644 0.925941335552 0.887006775608 4 64 None 0.902622087923 0.963178697732 0.849229640499 4 64 balanced 0.906875190413 0.928853754941 0.885912698413 4 128 None 0.900357591572 0.969003536509 0.840794223827 4 128 balanced 0.905852417303 0.925733305596 0.886807493025 4 256 None 0.899348439171 0.961930517995 0.844411979547 4 256 balanced 0.906856272219 0.928645724984 0.886065899166 5 32 None 0.911399767352 0.977948824631 0.853330913051 5 32 balanced 0.906504065041 0.927813605159 0.886151400755 5 64 None 0.911108947522 0.973372165592 0.856332357247 5 64 balanced 0.90590987692 0.926357395465 0.886345541401 5 128 None 0.913212687293 0.97628458498 0.85779564979 5 128 balanced 0.906040268456 0.926773455378 0.886214442013 5 256 None 0.911833398209 0.97462034533 0.8566465533 5 256 balanced 0.907537381752 0.928021635115 0.887937898089 6 32 None 0.912993039443 0.982317453713 0.852808379989 6 32 balanced 0.907335907336 0.928853754941 0.88679245283 6 64 None 0.912191794172 0.973580195548 0.858085808581 6 64 balanced 0.907298231348 0.928437695028 0.887099980123 6 128 None 0.912887248713 0.977740794674 0.856102003643 6 128 balanced 0.907132696606 0.928645724984 0.886593843098 6 256 None 0.912988894254 0.983357603495 0.852018745494 6 256 balanced 0.90777539998 0.926565425421 0.889732321215 7 32 None 0.913311500674 0.986270022883 0.850403587444 7 32 balanced 0.908185053381 0.929061784897 0.888225934765 7 64 None 0.913635047766 0.984813813189 0.852051835853 7 64 balanced 0.907518567504 0.927813605159 0.888092393469 7 128 None 0.914109245319 0.985229873102 0.852565256526 7 128 balanced 0.907759585071 0.928437695028 0.887982491047 7 256 None 0.913734939759 0.986061992927 0.851293103448 7 256 balanced
In [ ]:
 
In [ ]:
 
In [109]:
# Rebuild the split and re-derive training features (duplicates earlier cells
# so this section can run standalone).
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
train.time_from_solar_noon_ratio2('Clearsky GHI pvlib')
train.scale_by_irrad('Clearsky GHI pvlib')
In [110]:
# Rolling-window features over 3-sample windows (project API).
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [111]:
# Hyperparameters selected from the grid search above, optimizing F1.
best_params = dict(max_depth=4, n_estimators=128, class_weight='balanced')
# Alternatives from the same search:
#   recall-optimal:    {'max_depth': 6, 'n_estimators': 128, 'class_weight': 'balanced'}
#   precision-optimal: {'max_depth': 7, 'n_estimators': 32, 'class_weight': None}
In [112]:
# Train the random forest with the selected hyperparameters on the pre-2015 data.
clf = ensemble.RandomForestClassifier(**best_params, n_jobs=-1)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
Out[112]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=4, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [113]:
# Fresh 2015+ test slice, scaled by irradiance (project API).
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
test.scale_by_irrad('Clearsky GHI pvlib')
In [114]:
%%time
# Iterative day-by-day prediction over 3-sample windows; boolean clear/cloudy.
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

CPU times: user 4.15 s, sys: 396 ms, total: 4.54 s
Wall time: 51.1 s
In [115]:
# Overlay measured vs clearsky GHI and mark where the ML predictions and the
# NSRDB sky_status labels agree or disagree.
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
label_clear = test.df['sky_status'] == 1
label_cloudy = test.df['sky_status'] == 0
vis.add_circle_ser(test.df[label_cloudy & pred]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[label_clear & ~pred]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[label_clear & pred]['GHI'], 'ML+NSRDB clear only')
vis.show()
In [116]:
# Confusion matrix of NSRDB labels vs ML predictions (0=cloudy, 1=clear).
cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])
In [117]:
# F1 on the 2015+ NSRDB hold-out.
print(metrics.f1_score(test.df['sky_status'].values, pred))
0.906727197689

Train on all NSRDB data, test various freq of ground data

In [118]:
# Retrain the forest on the entire NSRDB record before scoring ground data.
train = cs_detection.ClearskyDetection(nsrdb.df)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
train.scale_by_irrad('Clearsky GHI pvlib')
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
x_all = train.df[feature_cols].values
y_all = train.df[target_cols].values.flatten()
clf.fit(x_all, y_all)
Out[118]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=4, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [119]:
# Bar chart of the trained forest's feature importances.
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

30 min freq ground data

In [155]:
# Load ground measurements and localize to MST.
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [156]:
test.trim_dates('10-01-2015', '10-21-2015')
In [157]:
# Same feature derivation as the training data (project API).
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.time_from_solar_noon_ratio2('Clearsky GHI pvlib')
test.scale_by_irrad('Clearsky GHI pvlib')
In [158]:
# Downsample to 30-minute samples by keeping on-the-half-hour rows.
test.df = test.df[test.df.index.minute % 30 == 0]
In [159]:
# test.df.loc[np.round(test.df['GHI'], 6) == 14.48218, 'GHI'] = 40
In [160]:
# test.df = test.df.resample('30T').apply(lambda x: x[len(x) // 2])
In [161]:
# 3-sample window at 30-min resolution (window size scales with frequency below).
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

In [162]:
# NSRDB rows aligned to the ground timestamps, for label comparison.
train2 = cs_detection.ClearskyDetection(nsrdb.df)
train2.intersection(test.df.index)
In [163]:
# Compare ML predictions on 30-min ground data against NSRDB labels.
nsrdb_clear = train2.df['sky_status'].values
ml_clear = pred
only_ml = ml_clear & ~nsrdb_clear
only_nsrdb = ~ml_clear & nsrdb_clear
both_clear = ml_clear & nsrdb_clear
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[only_ml]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[only_nsrdb]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[both_clear]['GHI'], 'Both clear')
vis.show()
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'Method 1')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'Method 2')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Method 1+2')
vis.show()
In [164]:
# Stash both label sets as columns for the per-day subplot helper below.
nsrdb_clear = train2.df['sky_status'].values
ml_clear = pred

test.df['nsrdb_sky'] = nsrdb_clear
test.df['ml_sky'] = pred
In [165]:
from plotly import tools as tls

def make_subplots(test, train, nrow, ncol, random_seed, width=800, height=1000):
    """Render an nrow x ncol grid of per-day GHI plots with clear-sky markers.

    Expects ``test.df`` to carry 'GHI', 'Clearsky GHI pvlib', and boolean
    'ml_sky' / 'nsrdb_sky' columns on a DatetimeIndex.  ``train`` is unused
    (kept for call compatibility).  The first nrow*ncol distinct days are
    shuffled with ``random_seed`` so the panel selection is reproducible.
    """
    colors = {
        'blue': '#1f77b4',
        'orange': '#ff7f0e',
        'green': '#2ca02c',
        'red': '#d62728',
        'purple': '#9467bd',
        'brown': '#8c564b',
        'pink': '#e377c2',
        'gray': '#7f7f7f',
        'yellow': '#bcbd22',
        'teal': '#17becf'
    }

    # Line styles for the two curves; marker styles for the three agreement classes.
    ghi_style = {'color': colors['blue'], 'width': 1}
    ghics_style = {'color': colors['orange'], 'width': 1}
    ml_marker = {'color': colors['green'], 'size': 6}
    nsrdb_marker = {'color': colors['red'], 'size': 6}
    both_marker = {'color': colors['purple'], 'size': 6}

    fig = tls.make_subplots(rows=nrow, cols=ncol, shared_xaxes=True, shared_yaxes=True, print_grid=True)
    fig['layout'].update(width=width, height=height)

    # Take the first nrow*ncol distinct days, then shuffle deterministically.
    days = np.unique(test.df.index.date)[:nrow * ncol]
    np.random.seed(random_seed)
    days = np.random.permutation(days)

    for i, day in enumerate(days):
        if i == nrow * ncol:
            break
        legend = False  # legends suppressed on every panel
        day_df = test.df[test.df.index.date == day].between_time('05:00:00', '19:00:00')
        day_df.index = range(len(day_df))
        ml = day_df['ml_sky']
        nsrdb = day_df['nsrdb_sky']
        traces = [
            go.Scatter(x=day_df.index, y=day_df['GHI'], line=ghi_style, showlegend=legend, name='GHI'),
            go.Scatter(x=day_df.index, y=day_df['Clearsky GHI pvlib'], line=ghics_style, showlegend=legend, name='GHIcs'),
            go.Scatter(x=day_df[ml & ~nsrdb].index, y=day_df[ml & ~nsrdb]['GHI'], mode='markers', marker=ml_marker, showlegend=legend, name='Method 1'),
            go.Scatter(x=day_df[~ml & nsrdb].index, y=day_df[~ml & nsrdb]['GHI'], mode='markers', marker=nsrdb_marker, showlegend=legend, name='Method 2'),
            go.Scatter(x=day_df[ml & nsrdb].index, y=day_df[ml & nsrdb]['GHI'], mode='markers', marker=both_marker, showlegend=legend, name='Method 1 & 2'),
        ]
        row = (i // ncol) + 1
        col = (i % ncol) + 1
        print(i, row, col, day)
        for trace in traces:
            fig.append_trace(trace, row, col)

    iplot(fig)
In [166]:
make_subplots(test, train2, 8, 2, 0, width=1000, height=1200)
This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y1 ]
[ (2,1) x1,y2 ]  [ (2,2) x2,y2 ]
[ (3,1) x1,y3 ]  [ (3,2) x2,y3 ]
[ (4,1) x1,y4 ]  [ (4,2) x2,y4 ]
[ (5,1) x1,y5 ]  [ (5,2) x2,y5 ]
[ (6,1) x1,y6 ]  [ (6,2) x2,y6 ]
[ (7,1) x1,y7 ]  [ (7,2) x2,y7 ]
[ (8,1) x1,y8 ]  [ (8,2) x2,y8 ]

0 1 1 2015-10-02
1 1 2 2015-10-07
2 2 1 2015-10-09
3 2 2 2015-10-10
4 3 1 2015-10-14
5 3 2 2015-10-05
6 4 1 2015-10-03
7 4 2 2015-10-15
8 5 1 2015-10-11
9 5 2 2015-10-08
10 6 1 2015-10-16
11 6 2 2015-10-12
12 7 1 2015-10-04
13 7 2 2015-10-01
14 8 1 2015-10-06
15 8 2 2015-10-13
In [167]:
# Class probabilities for each ground sample; column 1 is P(clear).
probas = clf.predict_proba(test.df[feature_cols].values)
In [168]:
test.df['probas'] = 0
In [169]:
test.df['probas'] = probas[:, 1]
trace0 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='GHI') trace1 = go.Scatter(x=test.df.index, y=test.df['Clearsky GHI pvlib'], name='GHIcs') trace2 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='prob', mode='markers', marker={'size': 12, 'color': test.df['probas'], 'colorscale': 'Viridis', 'showscale': True}, text='prob: ' + test.df['probas'].astype(str)) iplot([trace0, trace1, trace2])
In [170]:
visualize.plot_ts_slider_highligther(test.df, prob='probas')

15 min freq ground data

In [171]:
# Reload ground data for the 15-minute experiment.
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [172]:
test.trim_dates('10-01-2015', '10-17-2015')
In [173]:
# NOTE(review): unlike the 30-min section, time_from_solar_noon_ratio2 is not
# called here — confirm whether that feature is still populated.
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
In [174]:
# Keep on-the-quarter-hour rows only.
test.df = test.df[test.df.index.minute % 15 == 0]
# test.df = test.df.resample('15T').apply(lambda x: x[len(x) // 2])
In [175]:
# Window widened to 5 samples to keep a comparable time span at 15-min cadence.
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 5, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

In [176]:
# Align NSRDB labels to a full 15-minute index; rows NSRDB lacks become False.
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-17-2015')
full_index = pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='15min')
train2.df = train2.df.reindex(full_index)
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [177]:
# Compare ML (iterative) predictions vs NSRDB labels at 15-min resolution.
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']  # presumably written by iter_predict_daily — TODO confirm
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
In [178]:
nsrdb_clear = train2.df['sky_status'].astype(bool)
ml_clear = pred

# Store labels as columns; NSRDB rows missing from the ground index become False.
test.df['nsrdb_sky'] = nsrdb_clear
test.df['nsrdb_sky'] = test.df['nsrdb_sky'].replace(np.nan, False)
test.df['ml_sky'] = pred

make_subplots(test, train2, 8, 2, 0, width=1000, height=1200)
This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y1 ]
[ (2,1) x1,y2 ]  [ (2,2) x2,y2 ]
[ (3,1) x1,y3 ]  [ (3,2) x2,y3 ]
[ (4,1) x1,y4 ]  [ (4,2) x2,y4 ]
[ (5,1) x1,y5 ]  [ (5,2) x2,y5 ]
[ (6,1) x1,y6 ]  [ (6,2) x2,y6 ]
[ (7,1) x1,y7 ]  [ (7,2) x2,y7 ]
[ (8,1) x1,y8 ]  [ (8,2) x2,y8 ]

0 1 1 2015-10-02
1 1 2 2015-10-07
2 2 1 2015-10-09
3 2 2 2015-10-10
4 3 1 2015-10-14
5 3 2 2015-10-05
6 4 1 2015-10-03
7 4 2 2015-10-15
8 5 1 2015-10-11
9 5 2 2015-10-08
10 6 1 2015-10-16
11 6 2 2015-10-12
12 7 1 2015-10-04
13 7 2 2015-10-01
14 8 1 2015-10-06
15 8 2 2015-10-13
In [179]:
# P(clear) per sample (column 1 of predict_proba) and interactive inspection.
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')

10 min freq ground data

In [214]:
# Reload ground data for the 10-minute experiment.
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [215]:
# Shorter window (one week) than the other sections.
test.trim_dates('10-01-2015', '10-08-2015')
In [216]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
In [217]:
# Keep rows on 10-minute marks.
test.df = test.df[test.df.index.minute % 10 == 0]
In [218]:
# 7-sample window at 10-min cadence.
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 7, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

In [219]:
# Align NSRDB labels to a full 10-minute index; missing rows become False.
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-08-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='10min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [220]:
# Compare ML vs NSRDB labels at 10-min resolution.
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']  # presumably written by iter_predict_daily — TODO confirm
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
In [221]:
# P(clear) per sample and interactive inspection.
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]

visualize.plot_ts_slider_highligther(test.df, prob='probas')
In [211]:
# Bare expression: displays the raw probability array.
probas
Out[211]:
array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])
In [188]:
nsrdb_clear = train2.df['sky_status'].astype(bool)
ml_clear = pred

test.df['nsrdb_sky'] = nsrdb_clear
test.df['nsrdb_sky'] = test.df['nsrdb_sky'].replace(np.nan, False)
test.df['ml_sky'] = pred

# NOTE(review): the printed day grid below lists dates through 10-16 although
# this section trims to 10-08 — likely stale output from out-of-order execution.
make_subplots(test, train2, 8, 2, 0, width=1000, height=1200)
This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y1 ]
[ (2,1) x1,y2 ]  [ (2,2) x2,y2 ]
[ (3,1) x1,y3 ]  [ (3,2) x2,y3 ]
[ (4,1) x1,y4 ]  [ (4,2) x2,y4 ]
[ (5,1) x1,y5 ]  [ (5,2) x2,y5 ]
[ (6,1) x1,y6 ]  [ (6,2) x2,y6 ]
[ (7,1) x1,y7 ]  [ (7,2) x2,y7 ]
[ (8,1) x1,y8 ]  [ (8,2) x2,y8 ]

0 1 1 2015-10-02
1 1 2 2015-10-07
2 2 1 2015-10-09
3 2 2 2015-10-10
4 3 1 2015-10-14
5 3 2 2015-10-05
6 4 1 2015-10-03
7 4 2 2015-10-15
8 5 1 2015-10-11
9 5 2 2015-10-08
10 6 1 2015-10-16
11 6 2 2015-10-12
12 7 1 2015-10-04
13 7 2 2015-10-01
14 8 1 2015-10-06
15 8 2 2015-10-13

5 min freq ground data

In [204]:
# Reload ground data for the 5-minute experiment.
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [205]:
test.trim_dates('10-01-2015', '10-17-2015')
In [206]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
In [207]:
# Keep rows on 5-minute marks.
test.df = test.df[test.df.index.minute % 5 == 0]
In [208]:
# 13-sample window at 5-min cadence.
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 13, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

In [209]:
# Align NSRDB labels to a full 5-minute index; missing rows become False.
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-17-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='5min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [210]:
# Compare ML vs NSRDB labels at 5-min resolution.
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']  # presumably written by iter_predict_daily — TODO confirm
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'Method 1') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'Method 2') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Method 1+2') vis.show()from plotly import tools as tls nsrdb_clear = train2.df['sky_status'] ml_clear = pred print(len(nsrdb_clear), len(test.df)) test.df['nsrdb_sky'] = nsrdb_clear test.df['ml_sky'] = pred colors = { 'blue': '#1f77b4', 'orange': '#ff7f0e', 'green': '#2ca02c', 'red': '#d62728', 'purple': '#9467bd', 'brown': '#8c564b', 'pink': '#e377c2', 'gray': '#7f7f7f', 'yellow': '#bcbd22', 'teal': '#17becf' } ghi_line = {'color': colors['blue']} ghics_line = {'color': colors['orange']} ml_only = {'color': colors['green']} nsrdb_only = {'color': colors['red']} both = {'color': colors['purple']} nrow, ncol = 3, 3 fig = tls.make_subplots(rows=nrow, cols=ncol, shared_xaxes=True, shared_yaxes=True, print_grid=True) for i, (name, g) in enumerate(test.df.groupby(test.df.index.date)): if i == nrow * ncol: break legend = False if i == 0: legend = True g = g.between_time('05:00:00', '19:00:00') g.index = range(len(g)) trace0 = go.Scatter(x=g.index, y=g['GHI'], line=ghi_line, showlegend=legend, name='GHI') trace1 = go.Scatter(x=g.index, y=g['Clearsky GHI pvlib'], line=ghics_line, showlegend=legend, name='GHIcs') trace2 = go.Scatter(x=g[g['ml_sky'] & ~g['nsrdb_sky']].index, y=g[g['ml_sky'] & ~g['nsrdb_sky']]['GHI'], mode='markers', marker=ml_only, showlegend=legend, name='Method 1') trace3 = go.Scatter(x=g[~g['ml_sky'] & g['nsrdb_sky']].index, y=g[~g['ml_sky'] & g['nsrdb_sky']]['GHI'], mode='markers', marker=nsrdb_only, showlegend=legend, name='Method 2') trace4 = go.Scatter(x=g[g['ml_sky'] & g['nsrdb_sky']].index, y=g[g['ml_sky'] & g['nsrdb_sky']]['GHI'], mode='markers', 
marker=both, showlegend=legend, name='Method 1 & 2') row = i % nrow + 1 col = i // ncol + 1 traces = [trace0, trace1, trace2, trace3, trace4] for t in traces: fig.append_trace(t, row, col) iplot(fig, layout)probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas trace0 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='GHI') trace1 = go.Scatter(x=test.df.index, y=test.df['Clearsky GHI pvlib'], name='GHIcs') trace2 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='prob', mode='markers', marker={'size': 10, 'color': test.df['probas'], 'colorscale': 'Viridis', 'showscale': True}, text=test.df['probas']) iplot([trace0, trace1, trace2])
In [196]:
# P(clear) per 5-minute sample and interactive inspection plot.
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
# predict_proba returns one column per class; take column 1 (P(clear)).
# Assigning the full 2-column array to a single DataFrame column raises a
# ValueError, and every working section above uses probas[:, 1].
test.df['probas'] = probas[:, 1]

visualize.plot_ts_slider_highligther(test.df, prob='probas')

1 min freq ground data

In [229]:
# Reload ground data for the 1-minute experiment.
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [230]:
test.trim_dates('10-01-2015', '10-17-2015')
In [231]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
In [232]:
# NOTE(review): minute % 1 == 0 is always true, so this keeps every row;
# retained only for symmetry with the other frequency sections.
test.df = test.df[test.df.index.minute % 1 == 0]
In [233]:
# 61-sample window at 1-min cadence.
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 61, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:284: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:291: RuntimeWarning:

Scaling did not converge.

In [234]:
# Align NSRDB labels to a full 1-minute index; missing rows become False.
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-17-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='1min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [235]:
# Compare ML vs NSRDB labels at 1-min resolution.
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']  # presumably written by iter_predict_daily — TODO confirm
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas
trace0 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='GHI')
trace1 = go.Scatter(x=test.df.index, y=test.df['Clearsky GHI pvlib'], name='GHIcs')
trace2 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='prob', mode='markers',
                    marker={'size': 10, 'color': test.df['probas'], 'colorscale': 'Viridis', 'showscale': True},
                    text=test.df['probas'])
iplot([trace0, trace1, trace2])
In [ ]:
# P(clear) per 1-minute sample and interactive inspection plot.
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
# Take column 1 (P(clear)): assigning the full 2-column predict_proba array
# to a single column raises a ValueError, as in the earlier sections.
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')

Save model

In [ ]:
import pickle
In [ ]:
# Persist the trained classifier. NOTE(review): pickle files are only safe to
# reload from trusted sources (unpickling can execute arbitrary code).
with open('abq_trained.pkl', 'wb') as f:
    pickle.dump(clf, f)
In [ ]:
# Shell listing to confirm the file was written.
!ls abq*

Conclusion

In general, the clear sky identification looks good. At lower frequencies (30 min, 15 min) we see good agreement with NSRDB labeled points. I suspect this could be further improved by doing a larger hyperparameter search, or even by doing some feature extraction/reduction/additions.

In [ ]: